############################################################
# This code performs the quality control of the spatiotemporal dataset
#By Ahmed Hemedan
################################################################
# Quality control

# Quality report for the data2 dataset.
# The filtered expression table is wrapped in a minimal ExpressionSet and
# handed to arrayQualityMetrics(); values are passed as-is (no log transform)
# and an existing output directory is overwritten (force = TRUE).
minimalSet <- ExpressionSet(assayData = as.matrix(data2filt))
arrayQualityMetrics(
  expressionset = minimalSet,
  outdir = "Quality_Report_data2",
  force = TRUE,
  do.logtransform = FALSE
)


# Quality report for the data1 dataset (same settings, separate directory).
minimalSet <- ExpressionSet(assayData = as.matrix(data1filt))
arrayQualityMetrics(
  expressionset = minimalSet,
  outdir = "Quality_Report_data1",
  force = TRUE,
  do.logtransform = FALSE
)


# remove all samples failing at least two quality tests

# data2 dataset: sample IDs flagged for removal
failed_samples_data2 <- c("GSM606624", "GSM606625", "GSM606626")

# Fail early with a readable message if an ID is absent: match() would return
# NA for it, and negative indexing with NA stops with an obscure error.
missing_samples_data2 <- setdiff(failed_samples_data2, colnames(data2filt))
if (length(missing_samples_data2) > 0) {
  stop(
    "Samples to remove not found in data2filt: ",
    paste(missing_samples_data2, collapse = ", "),
    call. = FALSE
  )
}

# The outcome labels are subset by column position below, so there must be
# exactly one label per expression column.
stopifnot(length(data2_outcomefilt) == ncol(data2filt))

# column positions of the failing samples (non-empty and NA-free after the
# checks above, so negative indexing is safe)
remove_samples_data2 <- match(failed_samples_data2, colnames(data2filt))
data2filt2 <- data2filt[, -remove_samples_data2]
data2_outcome_final <- data2_outcomefilt[-remove_samples_data2]

# data1 dataset: no failing samples to remove
data1_outcome_final <- data1_outcomefilt


# Manual outlier check
#
# For each dataset the samples (columns) are compared through their pairwise
# distances: once as an average-linkage dendrogram and once as a
# two-dimensional PCoA (classical MDS) plot coloured by outcome class.
# The distance matrix is computed once per dataset and shared by both views.
# `labels = NULL` was dropped from the plot() calls: it is neither an
# argument of the default plot method nor a graphical parameter.

# data2 dataset: detect outliers via hierarchical clustering
distmat <- dist(t(data2filt2))
hcldat <- hclust(distmat, method = "average")
plot(hcldat)

# data2 dataset: detect outliers via PCoA
medianscale <- cmdscale(distmat, k = 2)
plot(
  medianscale[, 1], medianscale[, 2],
  # one colour per outcome class, in order of first appearance
  col = rainbow(2)[match(data2_outcome_final, unique(data2_outcome_final))],
  pch = 20, main = "PCoA plot", cex = 2, cex.axis = 0.1, tck = 0,
  xlab = "Dimension 1", ylab = "Dimension 2"
)

# data1 dataset: detect outliers via hierarchical clustering
distmat <- dist(t(data1filt))
hcldat <- hclust(distmat, method = "average")
plot(hcldat)

# data1 dataset: detect outliers via PCoA
medianscale <- cmdscale(distmat, k = 2)
plot(
  medianscale[, 1], medianscale[, 2],
  # one colour per outcome class, in order of first appearance
  col = rainbow(2)[match(data1_outcome_final, unique(data1_outcome_final))],
  pch = 20, main = "PCoA plot", cex = 2, cex.axis = 0.1, tck = 0,
  xlab = "Dimension 1", ylab = "Dimension 2"
)


# Data transformation using variance stabilising normalization (VSN)

# data2 dataset: plot row standard deviations against row means to look for
# intensity-dependent variance
meanSdPlot(as.matrix(data2filt2))

# fit VSN and keep the transformed expression matrix
data2vsn <- exprs(vsn2(as.matrix(data2filt2)))

# verify the fit: repeat the mean-sd plot on the transformed data
meanSdPlot(data2vsn)


# data1 dataset: check for intensity-dependent variance
meanSdPlot(as.matrix(data1filt))
# the variance depends on the average intensity, so VSN is applied here too

data1vsn <- exprs(vsn2(as.matrix(data1filt)))
data1vsn
# verify the fit: repeat the mean-sd plot on the transformed data
meanSdPlot(data1vsn)

# Power calculation (data2 dataset)

# samr input list: expression matrix `x`, outcome `y`, and one id/name per
# row of `x`. The outcome "y" for two unpaired classes must be the numeric
# labels 1, 2 (here 1 = "disease state: Control", 2 = everything else).
# seq_len() is used instead of 1:nrow() so an empty matrix cannot yield the
# bogus sequence c(1, 0).
data <- list(
  x = data2vsn,
  y = ifelse(data2_outcome_final == "disease state: Control", 1, 2),
  geneid = as.character(seq_len(nrow(data2vsn))),
  genenames = paste0("g", seq_len(nrow(data2vsn))),
  logged2 = TRUE
)


# run the simulation with 1000 permutations
samr.obj <- samr(
  data,
  resp.type = "Two class unpaired",
  nperms = 1000,
  random.seed = 1234
)


# investigate the following sample sizes of interest: 10, 20, 30, 50
# (expressed as multiples of the current number of samples)
colnum <- ncol(data$x)
sfactors <- c(10, 20, 30, 50) / colnum

# set seed value for the random number generator
set.seed(1234)


# determine power to detect 1.5-fold changes
samr.assess15 <- samr.assess.samplesize(
  samr.obj, data, log2(1.5),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess15)

# determine power to detect 1.1-fold changes
samr.assess11 <- samr.assess.samplesize(
  samr.obj, data, log2(1.1),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess11)


# data1 data - power calculation


# samr input list: expression matrix `x`, outcome `y`, and one id/name per
# row of `x`. The outcome "y" for two unpaired classes must be the numeric
# labels 1, 2 (here 1 = "control", 2 = everything else).
# The gene ids and names are sized from data1vsn, the matrix actually passed
# as `x`, so they always match its rows.
data <- list(
  x = data1vsn,
  y = ifelse(data1_outcome_final == "control", 1, 2),
  geneid = as.character(seq_len(nrow(data1vsn))),
  genenames = paste0("g", seq_len(nrow(data1vsn))),
  logged2 = TRUE
)


# run the simulation with 1000 permutations
samr.obj <- samr(
  data,
  resp.type = "Two class unpaired",
  nperms = 1000,
  random.seed = 1234
)


# investigate the following sample sizes of interest: 10, 20, 30, 50
# (expressed as multiples of the current number of samples)
colnum <- ncol(data$x)
sfactors <- c(10, 20, 30, 50) / colnum

# set seed value for the random number generator
set.seed(1234)


# determine power to detect 1.5-fold changes
samr.assess15 <- samr.assess.samplesize(
  samr.obj, data, log2(1.5),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess15)

# determine power to detect 1.1-fold changes
samr.assess11 <- samr.assess.samplesize(
  samr.obj, data, log2(1.1),
  samplesize.factors = sfactors
)
samr.assess.samplesize.plot(samr.assess11)
# show the data2 sample labels that the design below is built from
data2_outcome_final

# DEG Analysis of individual datasets

# Limma analysis of data2 dataset

# recode the outcome to the two group names used in the contrast below
data2_label <- ifelse(
  data2_outcome_final == "disease state: Control",
  "control",
  "parkinson"
)

# group-means design: one indicator column per group, no intercept
design <- model.matrix(~ -1 + factor(data2_label))
design
# model.matrix() orders its columns by factor level, not by the order in
# which labels first appear in the data. Naming the columns from levels()
# keeps every name on the right column; unique() would swap the names (and
# flip the sign of the contrast) whenever the first sample is not from the
# first level.
colnames(design) <- levels(factor(data2_label))
colnames(design)

# compute simple linear model fit to microarray data
fit <- lmFit(data2vsn, design)
# contrast of interest: parkinson minus control
contrast.matrix <- makeContrasts(parkinson - control, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)
eb <- eBayes(fit2)

# extract the ranking table (all rows) and show the top-ranked genes
ttable_data2 <- topTable(eb, number = nrow(data2filt2))
head(ttable_data2)


# Limma analysis of data1 dataset

# group-means design: one indicator column per outcome group, no intercept
design <- model.matrix(~ -1 + factor(data1_outcome_final))
design
# model.matrix() orders its columns by factor level, not by the order in
# which labels first appear in the data. Naming the columns from levels()
# keeps every name on the right column; unique() would swap the names (and
# flip the sign of the contrast) whenever the first sample is not from the
# first level.
# NOTE(review): the contrast below requires the outcome labels to be exactly
# "control" and "parkinson" -- confirm against how data1_outcomefilt is built.
colnames(design) <- levels(factor(data1_outcome_final))

# compute simple linear model fit to microarray data
fit <- lmFit(data1vsn, design)

# contrast of interest: parkinson minus control
contrast.matrix <- makeContrasts(parkinson - control, levels = design)
fit2 <- contrasts.fit(fit, contrast.matrix)

eb <- eBayes(fit2)

# extract the ranking table (all rows) and show the top-ranked genes
ttable_data1 <- topTable(eb, number = nrow(data1filt))

head(ttable_data1)
# show the data1 sample labels used for the design
data1_outcome_final

# Save the VSN-transformed expression matrix together with its outcome
# labels, one .Rdata file per dataset.
save(
  list = c("data1vsn", "data1_outcome_final"),
  file = "data1_preprocessed.Rdata"
)
save(
  list = c("data2vsn", "data2_outcome_final"),
  file = "data2_preprocessed.Rdata"
)
